{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "def4b0a5-3417-4ab0-91f7-c54cd9d2af28",
   "metadata": {},
   "source": [
    "# L6: Multimodal Recommender System"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "052891e0",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8210fd02",
   "metadata": {},
   "source": [
    "* In this classroom, the libraries have already been installed for you.\n",
    "* If you would like to run this code on your own machine, you need to install the following:\n",
    "```\n",
    "    !pip install -U weaviate-client\n",
    "    !pip install google-generativeai\n",
    "    !pip install openai\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37c1e227-8318-47c0-b13c-c93112619c9c",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "# Ignore every warning from here on (presumably to keep cell output uncluttered)\n",
    "warnings.simplefilter(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8db5bf0c",
   "metadata": {},
   "source": [
    "## Setup\n",
    "### Load environment variables and API keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "165fde08",
   "metadata": {
    "height": 132
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "\n",
    "# Read the local .env file so its entries are available as environment variables\n",
    "_ = load_dotenv(find_dotenv())\n",
    "\n",
    "# Credentials and endpoint used later; each is None when the variable is unset\n",
    "MM_EMBEDDING_API_KEY = os.environ.get(\"EMBEDDING_API_KEY\")\n",
    "TEXT_EMBEDDING_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n",
    "OPENAI_BASEURL = os.environ.get(\"OPENAI_BASE_URL\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c77481f-c61e-481d-a562-d7a6ccf50a1c",
   "metadata": {},
   "source": [
    "## Connect to Weaviate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "891a0261-e2e9-4296-9dd5-8cb7436cb938",
   "metadata": {
    "height": 268
   },
   "outputs": [],
   "source": [
    "import weaviate\n",
    "\n",
    "# Headers carrying the API keys and the OpenAI base URL read from the environment.\n",
    "# Entries whose environment variable was not set (value None) are left out,\n",
    "# since a header value has to be a string - e.g. OPENAI_BASE_URL may be\n",
    "# absent when running outside the classroom.\n",
    "api_headers = {\n",
    "    \"X-PALM-Api-Key\": MM_EMBEDDING_API_KEY,\n",
    "    \"X-OpenAI-Api-Key\": TEXT_EMBEDDING_API_KEY,\n",
    "    \"X-OpenAI-BaseURL\": OPENAI_BASEURL,\n",
    "}\n",
    "api_headers = {key: value for key, value in api_headers.items() if value is not None}\n",
    "\n",
    "# Connect to an embedded Weaviate instance with both vectorizer modules enabled\n",
    "client = weaviate.connect_to_embedded(\n",
    "    version=\"1.24.4\",\n",
    "    environment_variables={\n",
    "        \"ENABLE_MODULES\": \"multi2vec-palm,text2vec-openai\"\n",
    "    },\n",
    "    headers=api_headers,\n",
    ")\n",
    "\n",
    "# Last expression of the cell: shows whether the instance reports ready\n",
    "client.is_ready()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1487e4b9-bcc5-4edc-841e-d7fec106849b",
   "metadata": {},
   "source": [
    "## Create Multivector collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df51c1f6",
   "metadata": {
    "height": 608
   },
   "outputs": [],
   "source": [
    "from weaviate.classes.config import Configure, DataType, Property\n",
    "\n",
    "# Create the collection only when it is missing, so this cell can be re-run\n",
    "# without failing on an already existing \"Movies\" collection.\n",
    "# To rebuild it from scratch, run client.collections.delete(\"Movies\") first.\n",
    "if not client.collections.exists(\"Movies\"):\n",
    "    client.collections.create(\n",
    "        name=\"Movies\",\n",
    "        properties=[\n",
    "            Property(name=\"title\", data_type=DataType.TEXT),\n",
    "            Property(name=\"overview\", data_type=DataType.TEXT),\n",
    "            Property(name=\"vote_average\", data_type=DataType.NUMBER),\n",
    "            Property(name=\"release_year\", data_type=DataType.INT),\n",
    "            Property(name=\"tmdb_id\", data_type=DataType.INT),\n",
    "            Property(name=\"poster\", data_type=DataType.BLOB),\n",
    "            Property(name=\"poster_path\", data_type=DataType.TEXT),\n",
    "        ],\n",
    "\n",
    "        # Define & configure the vector spaces\n",
    "        vectorizer_config=[\n",
    "            # Vectorize the movie title and overview – for text-based semantic search\n",
    "            Configure.NamedVectors.text2vec_openai(\n",
    "                name=\"txt_vector\",                       # the name of the txt vector space\n",
    "                source_properties=[\"title\", \"overview\"], # text properties to be used for vectorization\n",
    "            ),\n",
    "\n",
    "            # Vectorize the movie poster – for image-based semantic search\n",
    "            Configure.NamedVectors.multi2vec_palm(\n",
    "                name=\"poster_vector\",                    # the name of the image vector space\n",
    "                image_fields=[\"poster\"],                 # image property to be used for vectorization\n",
    "\n",
    "                project_id=\"semi-random-dev\",\n",
    "                location=\"us-central1\",\n",
    "                model_id=\"multimodalembedding@001\",\n",
    "                dimensions=1408,\n",
    "            ),\n",
    "        ]\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7bde584-15f2-4aee-856f-51d87edf01ef",
   "metadata": {},
   "source": [
    "## Load in data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d8a03bb-057c-4fad-9c10-924670122fd3",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Access Files and Helper Functions:</b> To access the files for this notebook, 1) click on the <em>\"File\"</em> option on the top menu of the notebook and then 2) click on <em>\"Open\"</em>. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57248009-4ed4-4eff-8b73-6a679c2723c9",
   "metadata": {
    "height": 64
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the movie metadata file into a DataFrame\n",
    "df = pd.read_json(path_or_buf=\"movies_data.json\")\n",
    "\n",
    "# Show the first rows as the cell output\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc2f0366",
   "metadata": {},
   "source": [
    "## Helper function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "627e8b36",
   "metadata": {
    "height": 115
   },
   "outputs": [],
   "source": [
    "import base64\n",
    "\n",
    "# Helper: read a file and hand back its contents as base64 text\n",
    "def toBase64(path):\n",
    "    \"\"\"Return the bytes of the file at `path`, base64-encoded, as a `str`.\"\"\"\n",
    "    with open(path, mode='rb') as handle:\n",
    "        raw_bytes = handle.read()\n",
    "    encoded = base64.b64encode(raw_bytes)\n",
    "    return encoded.decode('utf-8')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b927fb51",
   "metadata": {},
   "source": [
    "## Import text and image data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bf03b05",
   "metadata": {
    "height": 608
   },
   "outputs": [],
   "source": [
    "from weaviate.util import generate_uuid5\n",
    "\n",
    "movies = client.collections.get(\"Movies\")\n",
    "\n",
    "# Queue objects through the collection's rate-limited batch context\n",
    "with movies.batch.rate_limit(20) as batch:\n",
    "    # Alternative for a quick trial run on a random subset:\n",
    "    # for index, movie in df.sample(20).iterrows():\n",
    "    for index, movie in df.iterrows():\n",
    "        # UUID derived from the movie id; computed once and reused for both\n",
    "        # the existence check and the insert.\n",
    "        movie_uuid = generate_uuid5(movie.id)\n",
    "\n",
    "        # In case you run it again - Don't import movies that are already in.\n",
    "        if movies.data.exists(movie_uuid):\n",
    "            print(f'{index}: Skipping insert. The movie \"{movie.title}\" is already in the database.')\n",
    "            continue\n",
    "\n",
    "        print(f'{index}: Adding \"{movie.title}\"')\n",
    "\n",
    "        # construct the path to the poster image file\n",
    "        poster_path = f\"./posters/{movie.id}_poster.jpg\"\n",
    "        # generate base64 representation of the poster\n",
    "        posterb64 = toBase64(poster_path)\n",
    "\n",
    "        # Build the object payload\n",
    "        movie_obj = {\n",
    "            \"title\": movie.title,\n",
    "            \"overview\": movie.overview,\n",
    "            \"vote_average\": movie.vote_average,\n",
    "            \"tmdb_id\": movie.id,\n",
    "            \"poster_path\": poster_path,\n",
    "            \"poster\": posterb64\n",
    "        }\n",
    "\n",
    "        # The Movies collection defines a release_year property; fill it in\n",
    "        # when the data provides a (non-null) value for this movie.\n",
    "        release_year = movie.get(\"release_year\")\n",
    "        if release_year is not None and pd.notna(release_year):\n",
    "            movie_obj[\"release_year\"] = int(release_year)\n",
    "\n",
    "        # Add object to batch queue\n",
    "        batch.add_object(\n",
    "            properties=movie_obj,\n",
    "            uuid=movie_uuid,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55e8c13e",
   "metadata": {
    "height": 132
   },
   "outputs": [],
   "source": [
    "# Report the outcome of the batch import\n",
    "failed_imports = movies.batch.failed_objects\n",
    "\n",
    "if not failed_imports:\n",
    "    print(\"Import complete with no errors\")\n",
    "else:\n",
    "    print(f\"Failed to import {len(failed_imports)} objects\")\n",
    "    for failure in failed_imports:\n",
    "        print(f\"e.g. Failed to import object with error: {failure.message}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e439604",
   "metadata": {},
   "source": [
    "## Text-search through the text vector space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc831304",
   "metadata": {
    "height": 234
   },
   "outputs": [],
   "source": [
    "from IPython.display import Image\n",
    "\n",
    "# Text query against the \"txt_vector\" named vector, top 3 matches\n",
    "search_text = \"Movie about lovable cute pets\"\n",
    "response = movies.query.near_text(query=search_text, target_vector=\"txt_vector\", limit=3)\n",
    "\n",
    "# Show title, overview and poster of each match\n",
    "for hit in response.objects:\n",
    "    props = hit.properties\n",
    "    print(props[\"title\"])\n",
    "    print(props[\"overview\"])\n",
    "    display(Image(props[\"poster_path\"], width=200))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac684e61",
   "metadata": {
    "height": 217
   },
   "outputs": [],
   "source": [
    "# Text query against the \"txt_vector\" named vector, top 3 matches\n",
    "search_text = \"Epic super hero\"\n",
    "response = movies.query.near_text(query=search_text, target_vector=\"txt_vector\", limit=3)\n",
    "\n",
    "# Show title, overview and poster of each match\n",
    "for hit in response.objects:\n",
    "    props = hit.properties\n",
    "    print(props[\"title\"])\n",
    "    print(props[\"overview\"])\n",
    "    display(Image(props[\"poster_path\"], width=200))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87408061",
   "metadata": {},
   "source": [
    "## Text-search through the posters vector space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd193cb2",
   "metadata": {
    "height": 217
   },
   "outputs": [],
   "source": [
    "# Same text query, but matched against the \"poster_vector\" named vector\n",
    "search_text = \"Movie about lovable cute pets\"\n",
    "response = movies.query.near_text(query=search_text, target_vector=\"poster_vector\", limit=3)\n",
    "\n",
    "# Show title, overview and poster of each match\n",
    "for hit in response.objects:\n",
    "    props = hit.properties\n",
    "    print(props[\"title\"])\n",
    "    print(props[\"overview\"])\n",
    "    display(Image(props[\"poster_path\"], width=200))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33403bbc",
   "metadata": {
    "height": 217
   },
   "outputs": [],
   "source": [
    "# Text query matched against the \"poster_vector\" named vector, top 3 matches\n",
    "search_text = \"Epic super hero\"\n",
    "response = movies.query.near_text(query=search_text, target_vector=\"poster_vector\", limit=3)\n",
    "\n",
    "# Show title, overview and poster of each match\n",
    "for hit in response.objects:\n",
    "    props = hit.properties\n",
    "    print(props[\"title\"])\n",
    "    print(props[\"overview\"])\n",
    "    display(Image(props[\"poster_path\"], width=200))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4707d580",
   "metadata": {},
   "source": [
    "## Image-search through the posters vector space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2937f209",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Preview the image that the next cell uses as its search input\n",
    "spooky_preview = Image(\"test/spooky.jpg\", width=300)\n",
    "spooky_preview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f7676fa",
   "metadata": {
    "height": 200
   },
   "outputs": [],
   "source": [
    "# Image query: base64-encode the file and match it against \"poster_vector\"\n",
    "query_image_b64 = toBase64(\"test/spooky.jpg\")\n",
    "response = movies.query.near_image(\n",
    "    near_image=query_image_b64,\n",
    "    target_vector=\"poster_vector\",\n",
    "    limit=3,\n",
    ")\n",
    "\n",
    "# Show title and poster of each match\n",
    "for hit in response.objects:\n",
    "    props = hit.properties\n",
    "    print(props[\"title\"])\n",
    "    display(Image(props[\"poster_path\"], width=200))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c602e02b",
   "metadata": {},
   "source": [
    "#### One more example!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0291f3d",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Preview the image that the next cell uses as its search input\n",
    "superheroes_preview = Image(\"test/superheroes.png\", width=300)\n",
    "superheroes_preview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adb2a31e",
   "metadata": {
    "height": 200
   },
   "outputs": [],
   "source": [
    "# Image query: base64-encode the file and match it against \"poster_vector\"\n",
    "query_image_b64 = toBase64(\"test/superheroes.png\")\n",
    "response = movies.query.near_image(\n",
    "    near_image=query_image_b64,\n",
    "    target_vector=\"poster_vector\",\n",
    "    limit=3,\n",
    ")\n",
    "\n",
    "# Show title and poster of each match\n",
    "for hit in response.objects:\n",
    "    props = hit.properties\n",
    "    print(props[\"title\"])\n",
    "    display(Image(props[\"poster_path\"], width=200))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e5e39f5",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Close the Weaviate client connection now that all queries are done\n",
    "client.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
